import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
import plotly.express as px
from matplotlib import pylab
import sys
import yaml
import os
import matplotlib.pyplot as plt
import scvelo as scv
from pybiomart import Server
import rpy2
from pandas.api.types import CategoricalDtype
import rpy2.robjects as ro
import seaborn as sns
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import ipyparams
warnings.filterwarnings('ignore')
outdir="./outdir"
FinaLeaf="/Neurons"
output_basename = "09A_Neurons_pBulk_bySegment.DEA"
mappingDict = {}
categoriesOrder = ["piece1","piece2","piece3","distal","medial","proximal"]
groupingCovariate = "group"
PseudooReplicates_per_group = 10
markers = "./data/resources/F_T.markers.scored.tsv"
totalPath = outdir+FinaLeaf+"/5A_Neurons_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.tsv"
adataPath = outdir+FinaLeaf+"/5A_Neurons_pBulk.bySegment."+str(PseudooReplicates_per_group)+"PRs.h5ad"
badhuriMarkers = ["PAX6","DDIT3","NEUROG2","CNBP","HMGB2","ZNF707","CAMTA1","SUB1","THYN1","NEUROG1","HMGB3","NFIX","BCL11A","TCF4","NR2F1"]
%load_ext rpy2.ipython
%%R
library(edgeR)
library(org.Hs.eg.db)
library(AnnotationDbi)
library(stats)
library(topGO)
R[write to console]: Loading required package: limma
R[write to console]: Loading required package: AnnotationDbi
R[write to console]: Loading required package: stats4
R[write to console]: Loading required package: BiocGenerics
R[write to console]: Loading required package: parallel
R[write to console]:
Attaching package: ‘BiocGenerics’
R[write to console]: The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
R[write to console]: The following object is masked from ‘package:limma’:
plotMA
R[write to console]: The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
R[write to console]: The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames,
dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
union, unique, unsplit, which.max, which.min
R[write to console]: Loading required package: Biobase
R[write to console]: Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
R[write to console]: Loading required package: IRanges
R[write to console]: Loading required package: S4Vectors
R[write to console]:
Attaching package: ‘S4Vectors’
R[write to console]: The following object is masked from ‘package:base’:
expand.grid
R[write to console]:
R[write to console]: Loading required package: graph
R[write to console]: Loading required package: GO.db
R[write to console]:
R[write to console]: Loading required package: SparseM
R[write to console]:
Attaching package: ‘SparseM’
R[write to console]: The following object is masked from ‘package:base’:
backsolve
R[write to console]:
groupGOTerms: GOBPTerm, GOMFTerm, GOCCTerm environments built.
R[write to console]:
Attaching package: ‘topGO’
R[write to console]: The following object is masked from ‘package:IRanges’:
members
%matplotlib inline
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
hostRoot = "-".join(socket.gethostname().split('-')[0:2])
with open(os.path.expanduser('~')+"/paths_config.yaml", 'r') as f:
paths = yaml.load(f, Loader=yaml.FullLoader)
#indir=paths["paths"]["indir"][hostRoot]
#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
adata = sc.read_h5ad(adataPath)
adata.obs[groupingCovariate] = adata.obs[groupingCovariate].astype(CategoricalDtype(categories=categoriesOrder, ordered=True))
total = pd.read_csv(totalPath, sep="\t", index_col=0)
#adata = adata[adata.obs["group"].isin(RelevantAreas)]
total = total[adata.obs_names]
edgeR_topTags = ro.r['topTags']
edgeR_glmLRT = ro.r['glmLRT']
edgeR_glmQLFTest = ro.r['glmQLFTest']
as_data_frame = ro.r['as.data.frame']
rprint = rpy2.robjects.globalenv.find("print")
DEGs = {}
TestCov = groupingCovariate
adata.obs.group.value_counts()
piece1 10 piece2 10 piece3 10 distal 10 medial 10 proximal 10 Name: group, dtype: int64
obs = adata.obs[[TestCov,"pseudoreplicate"]].copy()
obs["pseudoreplicate"] = obs.index.tolist()
obs["TestCov"] = obs[TestCov].astype(str)
#obs["TestCov"] = obs[TestCov]
totalRelevant = total[obs.index]
totalRelevant.head()
| distal_0 | distal_1 | distal_2 | distal_3 | distal_4 | distal_5 | distal_6 | distal_7 | distal_8 | distal_9 | ... | proximal_0 | proximal_1 | proximal_2 | proximal_3 | proximal_4 | proximal_5 | proximal_6 | proximal_7 | proximal_8 | proximal_9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MIR1302-2HG | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.564133 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.000000 |
| FAM138A | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.000000 |
| OR4F5 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.000000 |
| AL627309.1 | 0.0 | 0.0 | 0.0 | 1.823126 | 0.924461 | 0.903551 | 2.596238 | 0.845117 | 0.0 | 0.0 | ... | 1.213359 | 2.400798 | 1.462998 | 2.86158 | 1.283984 | 0.706691 | 0.0 | 4.56084 | 3.235815 | 0.613853 |
| AL627309.3 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.000000 |
5 rows × 60 columns
minCounts = totalRelevant[totalRelevant > 0].min(axis = 1).min()
#Universe 1 count in at least 5% of samples per group
groupThreshold = round(pd.get_dummies(obs[TestCov]).T.sum(axis = 1) *0.05)
bolMatrix = (np.matrix(np.dot(pd.get_dummies(obs[TestCov]).T, (totalRelevant > 0).astype(int).T)) - np.matrix(groupThreshold).T > 0)
bolVect = (bolMatrix.sum(axis = 0) >= 1).A1
totalRelevant = totalRelevant.loc[bolVect]
len(totalRelevant)
27149
universe = totalRelevant.index.tolist()
levelsToMap = adata.obs[groupingCovariate].cat.categories.tolist()
levelsToMap
['piece1', 'piece2', 'piece3', 'distal', 'medial', 'proximal']
%%R -i obs -i totalRelevant -i levelsToMap -o dds
mmVector<- factor(obs$TestCov, levels = c(levelsToMap))
mm <- model.matrix(~mmVector )
row.names(mm) <- colnames(totalRelevant)
dds <- DGEList(totalRelevant, group = obs$TestCov, genes = rownames(totalRelevant))
#Ennotate with entrez genes
anno <- select(org.Hs.eg.db, keys=rownames(dds$counts),columns=c("ENTREZID","SYMBOL"),keytype="SYMBOL")
colnames(anno) <- c("genes","entrez")
anno <- anno[!duplicated(anno$genes, fromLast=T), ]
dds$genes$entrez <- merge(x = data.frame(dds$genes), y = anno, by = "genes", all.x = TRUE)$entrez
dim(dds)
R[write to console]: 'select()' returned 1:many mapping between keys and columns
[1] 27149 60
%%R
mm
(Intercept) mmVectorpiece2 mmVectorpiece3 mmVectordistal
distal_0 1 0 0 1
distal_1 1 0 0 1
distal_2 1 0 0 1
distal_3 1 0 0 1
distal_4 1 0 0 1
distal_5 1 0 0 1
distal_6 1 0 0 1
distal_7 1 0 0 1
distal_8 1 0 0 1
distal_9 1 0 0 1
medial_0 1 0 0 0
medial_1 1 0 0 0
medial_2 1 0 0 0
medial_3 1 0 0 0
medial_4 1 0 0 0
medial_5 1 0 0 0
medial_6 1 0 0 0
medial_7 1 0 0 0
medial_8 1 0 0 0
medial_9 1 0 0 0
piece1_0 1 0 0 0
piece1_1 1 0 0 0
piece1_2 1 0 0 0
piece1_3 1 0 0 0
piece1_4 1 0 0 0
piece1_5 1 0 0 0
piece1_6 1 0 0 0
piece1_7 1 0 0 0
piece1_8 1 0 0 0
piece1_9 1 0 0 0
piece2_0 1 1 0 0
piece2_1 1 1 0 0
piece2_2 1 1 0 0
piece2_3 1 1 0 0
piece2_4 1 1 0 0
piece2_5 1 1 0 0
piece2_6 1 1 0 0
piece2_7 1 1 0 0
piece2_8 1 1 0 0
piece2_9 1 1 0 0
piece3_0 1 0 1 0
piece3_1 1 0 1 0
piece3_2 1 0 1 0
piece3_3 1 0 1 0
piece3_4 1 0 1 0
piece3_5 1 0 1 0
piece3_6 1 0 1 0
piece3_7 1 0 1 0
piece3_8 1 0 1 0
piece3_9 1 0 1 0
proximal_0 1 0 0 0
proximal_1 1 0 0 0
proximal_2 1 0 0 0
proximal_3 1 0 0 0
proximal_4 1 0 0 0
proximal_5 1 0 0 0
proximal_6 1 0 0 0
proximal_7 1 0 0 0
proximal_8 1 0 0 0
proximal_9 1 0 0 0
mmVectormedial mmVectorproximal
distal_0 0 0
distal_1 0 0
distal_2 0 0
distal_3 0 0
distal_4 0 0
distal_5 0 0
distal_6 0 0
distal_7 0 0
distal_8 0 0
distal_9 0 0
medial_0 1 0
medial_1 1 0
medial_2 1 0
medial_3 1 0
medial_4 1 0
medial_5 1 0
medial_6 1 0
medial_7 1 0
medial_8 1 0
medial_9 1 0
piece1_0 0 0
piece1_1 0 0
piece1_2 0 0
piece1_3 0 0
piece1_4 0 0
piece1_5 0 0
piece1_6 0 0
piece1_7 0 0
piece1_8 0 0
piece1_9 0 0
piece2_0 0 0
piece2_1 0 0
piece2_2 0 0
piece2_3 0 0
piece2_4 0 0
piece2_5 0 0
piece2_6 0 0
piece2_7 0 0
piece2_8 0 0
piece2_9 0 0
piece3_0 0 0
piece3_1 0 0
piece3_2 0 0
piece3_3 0 0
piece3_4 0 0
piece3_5 0 0
piece3_6 0 0
piece3_7 0 0
piece3_8 0 0
piece3_9 0 0
proximal_0 0 1
proximal_1 0 1
proximal_2 0 1
proximal_3 0 1
proximal_4 0 1
proximal_5 0 1
proximal_6 0 1
proximal_7 0 1
proximal_8 0 1
proximal_9 0 1
attr(,"assign")
[1] 0 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$mmVector
[1] "contr.treatment"
%%R -i minCounts
#keep <- filterByExpr(dds, min.count = minCounts, min.total.count = minCounts*5)
#dds <- dds[keep,,keep.lib.sizes=FALSE]
dim(dds)
[1] 27149 60
%%R
dds <- calcNormFactors(dds)
dds <- estimateGLMRobustDisp(dds, mm)
fit <- glmFit(dds, mm)
levelsToMap
['piece1', 'piece2', 'piece3', 'distal', 'medial', 'proximal']
%%R -o piece2_vs_piece1 -o piece3_vs_piece1 -o distal_vs_piece1 -o medial_vs_piece1 -o proximal_vs_piece1 -o proximal_vs_distal -o medial_vs_distal
#-o Mid_vs_early -o Late_vs_mid
topn=10000
piece2_vs_piece1 = topTags(glmLRT(fit, coef=2), adjust.method = "bonferroni", n = topn, sort.by = "logFC")
piece3_vs_piece1 = topTags(glmLRT(fit, coef=3), adjust.method = "bonferroni", n = topn, sort.by = "logFC")
distal_vs_piece1 = topTags(glmLRT(fit, coef=4), adjust.method = "bonferroni", n = topn, sort.by = "logFC")
medial_vs_piece1 = topTags(glmLRT(fit, coef=5), adjust.method = "bonferroni", n = topn, sort.by = "logFC")
proximal_vs_piece1 = topTags(glmLRT(fit, coef=6), adjust.method = "bonferroni", n = topn, sort.by = "logFC")
proximal_vs_distal = topTags(glmLRT(fit, contrast=c(0,0,0,-1,0,1)), adjust.method = "bonferroni", n = topn, sort.by = "logFC")
medial_vs_distal = topTags(glmLRT(fit, contrast=c(0,0,0,-1,1,0)), adjust.method = "bonferroni", n = topn, sort.by = "logFC")
DEGsDict = {"piece2_vs_piece1":piece2_vs_piece1,
"piece3_vs_piece1":piece3_vs_piece1,
"distal_vs_piece1":distal_vs_piece1,
"medial_vs_piece1":medial_vs_piece1,
"proximal_vs_piece1":proximal_vs_piece1,
"proximal_vs_distal":proximal_vs_distal,
"medial_vs_distal":medial_vs_distal,
}
markersDF = pd.read_csv(markers, header=None, sep = "\t", names=["name","area","score"])
temporalMarkers = markersDF.loc[markersDF["area"] == "Temporal","name"]
frontalMarkers = markersDF.loc[markersDF["area"] == "Frontal","name"]
LOGCFTHRESHOLD = 1.5
for k in list(DEGsDict.keys()):
DEGsDict[k] = rpy2.robjects.pandas2ri.rpy2py(as_data_frame(DEGsDict[k]))
DEGsDict[k]["-logPVal"] = -np.log10(DEGsDict[k].PValue)
DEGsDict[k]["significant"] = "notSignificant"
DEGsDict[k].loc[(DEGsDict[k]["logFC"] < -LOGCFTHRESHOLD) & (DEGsDict[k]["FWER"] < 0.01),"significant"] = "Down"
DEGsDict[k].loc[(DEGsDict[k]["logFC"] > LOGCFTHRESHOLD) & (DEGsDict[k]["FWER"] < 0.01),"significant"] = "Up"
#DEGsDict[k]["knownMarker"] = "unlisted"
#DEGsDict[k].loc[set(frontalMarkers).intersection(DEGsDict[k].index),"knownMarker"] = "frontal"
#DEGsDict[k].loc[set(temporalMarkers).intersection(DEGsDict[k].index),"knownMarker"] = "temporal"
color_discrete_map = {'notSignificant': 'white', 'Up': 'red', 'Down': 'blue'}
fig = px.scatter(DEGsDict[k], x="logFC", y="-logPVal",
#hover_data=DEGsDict[k][["genes","knownMarker"]],
hover_data=DEGsDict[k][["genes"]],
width=600, height=600,
color = "significant", color_discrete_map=color_discrete_map,
#symbol="knownMarker",
title=k,
template="simple_white")
fig.update_traces(mode='markers', marker_line_width=1, marker_size=10)
fig.add_hline(y=DEGsDict[k].loc[DEGsDict[k]["FWER"] - 0.01 < 0,"-logPVal" ].min(),
line_color="black",line_width=2, line_dash="dash")
fig.add_vline(x=LOGCFTHRESHOLD,
line_color="black",line_width=2, line_dash="dash")
fig.add_vline(x=-LOGCFTHRESHOLD,
line_color="black",line_width=2, line_dash="dash")
fig.show()
contrast = "proximal_vs_distal"
GOname = "GO_proximal_vs_distal.tsv"
GOlist = DEGsDict[contrast][DEGsDict[contrast]["significant"] != "notSignificant"].index.tolist()
%%R -i GOlist -i universe -o results
Universe <- universe
Temporal_vs_PFCVector <- factor(as.integer(Universe%in%GOlist))
names(Temporal_vs_PFCVector) <- Universe
BPann <- inverseList(topGO::annFUN.org(whichOnto="BP", feasibleGenes=names(Temporal_vs_PFCVector),
mapping="org.Hs.eg.db", ID="symbol"))
GOdata <- new('topGOdata', ontology="BP", allGenes=Temporal_vs_PFCVector, annotat=annFUN.gene2GO,
gene2GO=BPann, nodeSize=15)
GOtest <- runTest(GOdata, algorithm='weight01', statistic='fisher')
results <- topGO::GenTable(GOdata, Statistics=GOtest, topNodes=length(GOtest@score), numChar=100)
R[write to console]: Building most specific GOs ..... R[write to console]: ( 12008 GO terms found. ) R[write to console]: Build GO DAG topology .......... R[write to console]: ( 15807 GO terms and 36827 relations. ) R[write to console]: Annotating nodes ............... R[write to console]: ( 15250 genes annotated to the GO terms. ) R[write to console]: -- Weight01 Algorithm -- the algorithm is scoring 2662 nontrivial nodes parameters: test statistic: fisher R[write to console]: Level 19: 1 nodes to be scored (0 eliminated genes) R[write to console]: Level 18: 1 nodes to be scored (0 eliminated genes) R[write to console]: Level 17: 3 nodes to be scored (20 eliminated genes) R[write to console]: Level 16: 8 nodes to be scored (27 eliminated genes) R[write to console]: Level 15: 17 nodes to be scored (55 eliminated genes) R[write to console]: Level 14: 27 nodes to be scored (224 eliminated genes) R[write to console]: Level 13: 51 nodes to be scored (525 eliminated genes) R[write to console]: Level 12: 96 nodes to be scored (1064 eliminated genes) R[write to console]: Level 11: 159 nodes to be scored (3276 eliminated genes) R[write to console]: Level 10: 243 nodes to be scored (5132 eliminated genes) R[write to console]: Level 9: 304 nodes to be scored (6806 eliminated genes) R[write to console]: Level 8: 367 nodes to be scored (8700 eliminated genes) R[write to console]: Level 7: 425 nodes to be scored (10690 eliminated genes) R[write to console]: Level 6: 400 nodes to be scored (12658 eliminated genes) R[write to console]: Level 5: 289 nodes to be scored (13748 eliminated genes) R[write to console]: Level 4: 165 nodes to be scored (14590 eliminated genes) R[write to console]: Level 3: 85 nodes to be scored (14860 eliminated genes) R[write to console]: Level 2: 20 nodes to be scored (14983 eliminated genes) R[write to console]: Level 1: 1 nodes to be scored (15069 eliminated genes)
FilteredResults = results[results["Statistics"].astype(float) < 0.01]
FilteredResults.to_csv(outdir+FinaLeaf+"/"+output_basename+"."+GOname, sep = "\t")
FilteredResults.Term.tolist()[:20]
['forebrain regionalization', 'axon guidance', 'cell fate determination', 'positive regulation of neuron differentiation', 'forebrain neuron differentiation', 'neuron migration', 'ureter development', 'nephron tubule formation', 'cerebral cortex development', 'synapse maturation', 'negative regulation of transcription by RNA polymerase II', 'neuron fate commitment', 'nephric duct development', 'dentate gyrus development', 'glial cell differentiation', 'metanephric renal vesicle morphogenesis', 'positive regulation of branching involved in ureteric bud morphogenesis', 'positive regulation of transcription by RNA polymerase II', 'regulation of epithelial cell differentiation involved in kidney development', 'mesenchymal to epithelial transition']
UP = DEGsDict[contrast][DEGsDict[contrast]["significant"] != "notSignificant"].sort_values("logFC", ascending = False)
UP = UP[UP.logFC > 0].index.tolist()
DOWN = DEGsDict[contrast][DEGsDict[contrast]["significant"] != "notSignificant"].sort_values("logFC", ascending = True)
DOWN = DOWN[DOWN.logFC < 0].index.tolist()
print("Upregulated in contrast "+contrast)
sc.pl.violin(adata,
keys = UP[:4] ,
groupby=groupingCovariate)
Upregulated in contrast proximal_vs_distal
print("Downregolated in contrast "+contrast)
sc.pl.violin(adata,
keys = DOWN[:4] ,
groupby=groupingCovariate)
Downregolated in contrast proximal_vs_distal
for k in DEGsDict.keys():
DEGsDict[k][DEGsDict[k].significant != "notSignificant" ].sort_values("logFC", ascending = False).to_csv(outdir+FinaLeaf+"/"+output_basename+"."+ k + ".tsv", sep = "\t")
DEGsDict[k][DEGsDict[k].significant != "notSignificant" ].sort_values("logFC", ascending = False).to_excel(outdir+FinaLeaf+"/"+output_basename+"."+ k + ".xls")